1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler.extractor;
28
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.smartcrawler.common.Link;
import org.smartcrawler.common.SCLogger;
import org.smartcrawler.extractor.pattern.AbstractPattern;
import org.smartcrawler.retriever.Content;
41
42
43 /***
44 *
45 *
46 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
47 * @version <tt>$Revision: 1.8 $</tt>
48 */
49 public class RegExpLinksExtractor implements LinksExtractor {
50
51 private static Logger log = SCLogger.getLogger(RegExpLinksExtractor.class);
52 private static Logger logExtr = SCLogger.getExtractorLogger();
53
54 /*** Standard charset.*/
55 private final static Charset charset = Charset.forName("ISO-8859-15");
56
57 /*** Standard charset decoder.*/
58 private final static CharsetDecoder decoder = charset.newDecoder();
59
60 /*** The list of the extraction patterns. */
61 private static AbstractPattern[] apatList =
62 PatternProvider.instance().getPatterns();
63
64 private LinkBuilder linkBuilder;
65
66 /***
67 * Creates a new instance of RegExpLinksExtractor
68 * @param parsedPageLink
69 */
70 public RegExpLinksExtractor(Link parsedPageLink) {
71 this.linkBuilder = new LinkBuilderImpl(parsedPageLink);
72 }
73
74 /***
75 *
76 * @param content
77 * @return
78 */
79 public Link[] extract(Content content) {
80 log.debug("extractLinks(): BEGIN");
81 if (content.getContentType().indexOf("htm") < 0){
82 return new Link[0];
83 }
84 byte[] buffer = content.getBuffer();
85 ByteBuffer bbuf = ByteBuffer.allocate(buffer.length);
86 bbuf.put(buffer);
87 bbuf.flip();
88
89 Vector<Link> vect = new Vector<Link>();
90 try {
91 CharBuffer charBuf = decoder.decode(bbuf);
92
93 for (AbstractPattern apat : apatList) {
94 log.debug("extractLinks(): [" + apat.getClass().getName()
95 + "] checking pattern " + apat.getPattern());
96 Pattern p = apat.getPattern();
97
98 Matcher matcher = p.matcher(charBuf);
99 while(matcher.find()) {
100 CharSequence cs = matcher.group(apat.getGroupAsInt());
101
102 HtmlURL htmlURL = new HtmlURLImpl(cs.toString());
103 logExtr.info(apat.getClass().getName() +
104 " " + content.getLink() +
105 " " + htmlURL.getCleanedLinkAsString());
106
107 Link newLink = linkBuilder.buildLink(htmlURL);
108
109 if (newLink != null && !vect.contains(newLink)) {
110 vect.add(newLink);
111 log.debug("extractLinks(): adding link " + newLink);
112 }
113 }
114 }
115 }catch(Exception e) {
116 log.error("extractLinks(): Error extracting links.", e);
117 }
118 log.debug("extractLinks(): found " + vect.size() + " links");
119 Link[] res = new Link[vect.size()];
120 vect.copyInto(res);
121 bbuf.clear();
122
123 log.debug("extractLinks(): END");
124 return res;
125
126 }
127 }